clear all
global main "/Users/inga/Dropbox/Ais project/Repository"
cd "$main/1_AIS_data_summstats"

global data "$main/data_intermediate"
global source "$main/data_input"
*global source_1 "../../Get ship_port_to_port.dta/midstep Data (may needed later)"   // 	TO BE FIXED
*global source_2 "../../Get ship_port_to_port.dta"   								// 	TO BE FIXED




*global data "data"
*global matlab "MATLAB"
*global figures "figures"
 


*** THIS DO FILE IS STRUCTURED AS FOLLOWS

* 1) generate new direct travel time dataset based on trips between actual (non-anchorage, non-pure-transit ports), using 
**** only trips that did not involve a stop at an anchorage port in between  --> triplist_na.dta

* 2) find port clusters by aggregating all ports within a 30km radius of at least one cluster member
**** this step involves MATLAB and its graph package; see NetworkCleaning.m
 
* 3) IMPOSE SAMPLE RESTRICTIONS, CHECK HOW MUCH OF GLOBALLY SHIPPED VOLUME WE LOSE, GENERATE NEW PORT-TO-PORT DATASET INCLUDING INDICATOR FOR 
**** WHICH PORT BELONGS TO WHICH CLUSTER FOR A BALANCED SAMPLE OF PORTS (all ports that both departure and arrival ports) +++ ->  ship_port_to_port_wclusters.dta



** input files from Yuan

* ais_container_cleaned.dta
* ship_all_variables.dta
* ship_port_to_port.dta


*** previous version of this do-file: 01_dataprep_BoxAccounting.do



************************************************************************************************************************************************************************************
************************ 1) find trips that do not involve stops in anchorage ports and compute direct travel time at the port-to-port level ***************************************  
************************************************************************************************************************************************************************************

*use "$source_1/ais_container_cleaned.dta", clear
use "$data/ais_container_cleaned.dta", clear


* merge scantling draught information
*merge m:1 ship_id_new using "$source_1/ship_all_variables.dta", keepusing(draughtm draught_max)
merge m:1 ship_id_new using "$data/ship_all_variables.dta", keepusing(draughtm draught_max)
drop if _merge==2
drop _merge
replace draughtm=10*draughtm
egen draught_s=rowmax(draughtm draught_max)
drop draughtm draught_max


* define trips starting and ending in regular non-anchorage non-transit port
g start = 1 if port_type == "P" & move_type == "departure" & inTransit=="false"
sort ship_id time
bysort ship_id: g ftrip = sum(start)

g ballast = 1 if new_draught<.55*draught_s 


* tag and drop trips involving STOPS at anchorage port
g anchor = 1 if port_type == "A" &  inTransit == "false" & ballast==1
egen t_anchor = total(anchor), by(ftrip ship_id)
drop if t_anchor > 0
drop *anchor start trip travel_hs

*drop transit ports
drop if inTransit == "true"


*drop non-ballast anchorages
drop if port_type=="A" & ballast==.

*eliminate trips starting with arrival in a regular port, rather than departure
bysort ship_id: drop if move_type == "arrival" & ship_id[_n-1]==.

*eliminate trips ending with departure rather than arrival
bysort ship_id: drop if move_type == "departure" & ship_id[_n+1]==.



*** check: every trip should now consist of exactly two observations: departure from regular port and arrival in regular port
tab port_type
egen count = count(ftrip), by(ftrip ship_id)
summ count
drop count
***


*reshape dataset to wide: each observation is a trip
*drop inTransit time* port_type
drop inTransit time* port_type ballast draught_s

replace move_type = "D" if move_type =="departure"
replace move_type = "A" if move_type =="arrival"

reshape wide port_id port_name new lon lat country date, i(ship_id ftrip) j(move_type) string
rename *D D_*
rename *A A_*
drop ftrip

drop if A_port_id == D_port_id

*compute travel hours and distance
g travel_hs=hours(A_date-D_date)

geodist A_lat A_lon D_lat D_lon, g(distance)

save "$data/triplist_na", replace




***********************************************************************************************************************
*********************** 2) find ports clusters *************************************************************************
***********************************************************************************************************************


*g list of port pairs at smaller distance than 30km

*u "$source_2/ship_port_to_port", clear

u "$data/ship_port_to_port", clear

geodist A_lat A_lon D_lat D_lon, g(distance)

*u $data/triplist_na2, clear
keep *name *id distance *country *lat *lon
keep if distance<30 & A_country == D_country
duplicates drop A_port_id D_port_id, force

preserve
keep A*
duplicates drop
save "$data/A", replace
restore

keep D*
rename D* A*
duplicates drop
append using "$data/A"
duplicates drop

* new id
sort A_port_id
g iid = _n
save "$data/pid", replace

*u "$source_2/ship_port_to_port", clear
u "$data/ship_port_to_port", clear
geodist A_lat A_lon D_lat D_lon, g(distance)

keep if distance<30 & A_country == D_country
keep *id
duplicates drop

merge m:1 A_por using "$data/pid", keepusing(iid)
drop if _merge==2
drop _merge
rename iid aid
drop A_p 
rename D_p A_port_id
merge m:1 A_por using "$data/pid", keepusing(iid)
drop if _merge==2
drop _merge
rename iid did
drop *_id


** network methodology is used to find clusters

* drop duplicates in list of undirected edges
sort aid did
egen max = rowmax(aid did)
egen min = rowmin(aid did)
tostring max min, replace
g pid = max + "_" + min
duplicates drop pid, force

* export to MATLAB

outfile aid did using "$data/network_close.csv", comma wide replace

erase "$data/A.dta"


*##### run PortClusters.m


** import from MATLAB

* row indicates port, cell entry indicates cluster

import delim using "$data/connectedcomp.csv", clear
g iid = _n 
rename v1 cluster
merge 1:1 iid using "$data/pid"
drop _merge iid
save "$data/port_cluster", replace




******************************************************************************************************************************************
************ make a dataset with more meaningful port names for the clusters and coordinates for cluster's center ************************
******************************************************************************************************************************************
*** use name and coordinates of biggest port



u "$data/ship_port_to_port", clear
collapse (sum) IN=dwt, by(A_port_id A_port_name A_country A_lat A_lon)
save "$data/IN_A", replace

u "$data/ship_port_to_port", clear
collapse (sum) OUT=dwt, by(D_port_id D_port_name D_country D_lat D_lon)

rename D_* A_*
merge 1:1 A_port_id using "$data/IN_A", update
drop _merge

egen INOUT = rowtotal(IN OUT)

merge m:1 A_port_id using "$data/port_cluster"  /*_merge==2 observations are ports which do not have nearby neighbors (did not enter the clustering exercise)*/
drop _merge

egen max = max(INOUT) if cluster!=. , by(cluster)

g cl_lat = A_lat if INOUT==max
g cl_lon = A_lon if INOUT==max
g cl_name = A_port_name + "*" if INOUT == max

gsort cluster -INOUT
by cluster: carryforward cl_lat cl_lon cl_name, replace

replace A_port_name = cl_name if cluster!=.
replace A_lat = cl_lat if cluster!=.
replace A_lon = cl_lon if cluster!=.

keep A_*

save "$data/port_cluster_name", replace
erase "$data/IN_A.dta"














********************************************************************************************************************************************

* 3) IMPOSE SAMPLE RESTRICTIONS, CHECK HOW MUCH OF GLOBALLY SHIPPED VOLUME WE LOSE, GENERATE NEW PORT-TO-PORT DATASET INCLUDING INDICATOR FOR 
**** WHICH PORT BELONGS TO WHICH CLUSTER
 
* a) use only ships with dwt > 15800, corresponding to roughly 1000 TEU
* b) mimimum # ships per route



u "$data/triplist_na", clear

*** merge ship specifics

merge m:1 ship_id using "$data/ship_all_variables.dta", keepusing(teu dwt)
drop if _merge==2
drop _merge

rename A_* AA_*
rename D_* A_*
merge m:1 A_port_id using "$data/port_cluster_name", update replace
drop if _merge==2
drop _merge 

rename A_* D_*
rename AA_* A_*

merge m:1 A_port_id using "$data/port_cluster_name", update replace
drop if _merge==2
drop _merge 


drop if dwt<15800
drop if trav==0

drop if A_port_name == D_port_name


egen freq = count(ship_id), by(A_port_name D_port_name *country)


drop if freq<5

drop freq*
collapse (median) travel (count) freq = ship_id, by(A_port_name D_port_name *country *lat *lon)



geodist A_lat A_lon D_lat D_lon, g(distance)

scatter travel dist, scale(.3)

save "$data/travel_hours_clusters", replace


erase "$data/port_cluster.dta"






************************************************************************************************************************
************************************ generate stopping time ************************************************************


u "$data/ship_port_to_port", clear


rename A_* AA_*
rename D_* A_*
merge m:1 A_port_id using "$data/port_cluster_name", update replace
drop if _merge==2
drop _merge 

rename A_* D_*
rename AA_* A_*

merge m:1 A_port_id using "$data/port_cluster_name", update replace
drop if _merge==2
drop _merge 

drop if A_port_name == D_port_name

* distinct ports after forming clusters of ports within 30km radius


merge m:1 A_port_name D_port_name *country using "$data/travel_hours_clusters"
keep if _merge==3
drop _merge


sort ship_id A_date

bysort ship_id: g stoptime = hours(D_date[_n+1]-A_date)
drop if stoptime==.

g stoptime2 = stoptime if ballast_55!=1 | ballast_55[_n+1]==1

collapse (median) stoptime stoptime2 (count) freq=stoptime, by(A_country A_port_name)
sort stoptime
replace stoptime=stoptime2 if stoptime2!=.
drop stoptime2 freq

save "$data/stoptime", replace



u "$data/travel_hours_clusters", clear

merge m:1 A_port_name A_country using "$data/stoptime"
g A_stoptime=stoptime/2
drop _merge stoptime
rename A_* AA_*
rename D_* A_*
merge m:1 A_port_name A_country using "$data/stoptime"
g A_stoptime=stoptime/2
drop _merge stoptime
rename A_* D_* 
rename AA_* A_*

g travel_stop_hs = travel_hs+A_stoptime+D_stoptime
save "$data/travel_stop_hours_clusters", replace








*****************************************************************************************************************
******************     generate new ship_port_to_port data with clustered ports *********************************



u "$data/ship_port_to_port", clear

drop travel_hs /*this measure is replace by the correct #hours below*/

rename A_* AA_*
rename D_* A_*
merge m:1 A_port_id using "$data/port_cluster_name", update replace
drop if _merge==2
drop _merge 

rename A_* D_*
rename AA_* A_*

merge m:1 A_port_id using "$data/port_cluster_name", update replace
drop if _merge==2
drop _merge 

*find pairs which will be in the new dataset
merge m:1 A_port_name D_port_name *country using "$data/travel_stop_hours_clusters", keepusing(freq)
drop if _merge==2


/* I keep all observations for now in order not to break the trip structure */
/* dwt on routes that we won't use will be set to zero in the next step*/

g RestrSample = 1 if _merge==3 | (D_port_name == A_port_name & D_country==A_country)
drop _merge
g travel_hs=hours(A_date-D_date)


save "$data/ship_port_to_port_wclusters", replace






******************************************************************************************
***************** make balanced port data set ********************************************
***** use only ports which appear as A_ports and D_ports after imposing the sample *******
***** + dropping ships in ballast
***** restrictions. This is done in a few iterations *************************************



u "$data/ship_port_to_port_wclusters", clear
drop if RestrSample!=1
drop if A_co == D_co & A_port_name == D_port_name
drop if ballast_55==1
keep A_port_name A_co
duplicates drop
save "$data/a", replace

u "$data/ship_port_to_port_wclusters", clear
drop if RestrSample!=1
drop if A_co == D_co & A_port_name == D_port_name
drop if ballast_55==1
keep D_port_name D_co
duplicates drop
rename D_* A_*
merge 1:1 A_c A_p using "$data/a"
keep if _merge==3
drop _merge
save "$data/port_id_list_balanced1", replace

u "$data/ship_port_to_port_wclusters", clear
drop if RestrSample!=1
drop if A_co == D_co & A_port_name == D_port_name
drop if ballast_55==1
merge m:1 A_port_name A_country using "$data/port_id_list_balanced1"
drop if _merge==1
drop _merge
rename A_port_name AA_port_name
rename A_country AA_country
rename D_port_name A_port_name
rename D_country A_country
merge m:1 A_port_name A_country using "$data/port_id_list_balanced1"
keep if _merge==3
rename A_country D_country
rename A_port_name D_port_name
rename AA_* A_*
save "$data/ship_cl2", replace

keep A_port_name A_co
duplicates drop
save "$data/a2", replace

u "$data/ship_cl2", clear
keep D_port_name D_co
duplicates drop
rename D_* A_*
merge 1:1 A_c A_p using "$data/a2"
keep if _merge==3
drop _merge
save "$data/port_id_list_balanced2", replace

u "$data/ship_port_to_port_wclusters", clear
drop if RestrSample!=1
drop if A_co == D_co & A_port_name == D_port_name
drop if ballast_55==1
merge m:1 A_port_name A_country using "$data/port_id_list_balanced2"
keep if _merge==3
drop _merge
rename A_port_name AA_port_name
rename A_country AA_country
rename D_port_name A_port_name
rename D_country A_country
merge m:1 A_port_name A_country using "$data/port_id_list_balanced2"
keep if _merge==3
rename A_country D_country
rename A_port_name D_port_name
rename AA_* A_*
save "$data/ship_cl3", replace

keep A_port_name A_co
duplicates drop
save "$data/a3", replace

u "$data/ship_cl3", clear
keep D_port_name D_co
duplicates drop
rename D_* A_*
merge 1:1 A_c A_p using "$data/a3"
keep if _merge==3
drop _merge
sort A_port_name A_country
egen A_acid = group(A_port_name A_c)
save "$data/port_id_list_balanced_small", replace

rename A_* D_*
cross using "$data/port_id_list_balanced_small"
save "$data/pid", replace


u "$data/ship_port_to_port_wclusters", clear
merge m:1 A_country A_port_name D_port_name D_coun using "$data/pid"
keep if _merge==3
drop *60 *50 *65
drop if RestrSample!=1
drop if A_co == D_co & A_port_name == D_port_name
drop if ballast_55==1
drop _merge

save "$data/port_to_port_ship_bal_clus_small", replace





*** make list of ports in sample with specifics
u "$data/port_id_list_balanced_small", clear
merge 1:m A_port_name A_country using "$data/port_cluster_name"
drop if _merge==2
collapse (firstnm) *lat *lon, by(A_acid A_port_name A_co)
save "$data/port_list_sample_small", replace



*** make datasets by port: tonnes going out and coming in 

u "$data/port_to_port_ship_bal_clus_small", clear
replace dwt_u=0 if A_acid == D_acid
drop if A_acid==. | D_acid==.

collapse (sum) dwt_usage, by(D_acid)
rename dwt x_j
rename D A_acid
save "$data/xj2_small", replace



u "$data/port_to_port_ship_bal_clus_small", clear
replace dwt_u=0 if A_acid == D_acid
drop if A_acid==. | D_acid==.
collapse (sum) dwt_usage, by(A_acid)
save "$data/IN_A_acid_small", replace


**** make bilateral dataset with port names

u "$data/port_list_sample_small", clear
rename A_* D_*
cross using "$data/port_list_sample_small"
save "$data/port_to_port_sample_small", replace






u "$data/ship_port_to_port_wclusters", clear
drop if RestrSample!=1
drop if A_co == D_co & A_port_name == D_port_name
drop  *_port_id
merge m:1 *port_name *country using "$data/port_to_port_sample_small"
keep if _merge==3
drop _merge
save "$data/ship_port_to_port_final", replace






**** make dataset with share of tonnes per trip transported on NEOPANAMAX ships



u "$data/port_to_port_ship_bal_clus_small", clear
drop if A_acid == D_acid
drop if A_acid==. | D_acid==.
drop if Restr!=1

g NEOP=1 if teu>5100 & teu<14501
replace NEOP=0 if NEOP==.

g A_day=dofc(A_date)-20453
g pre=0
replace pre = 1 if A_day<184

g teuNEOP=teu*NEOP
g teuNEOP_pre=teu*NEOP*pre
g teu_pre=pre*teu

collapse (sum) teu teu_pre teuNE*, by(A_acid D_acid)

save "$data/teuNEOP_small", replace






erase "$data/a.dta" 
erase "$data/a2.dta" 
erase "$data/a3.dta"
erase "$data/port_id_list_balanced1.dta"
erase "$data/port_id_list_balanced2.dta"
erase "$data/ship_cl3.dta"
erase "$data/ship_cl2.dta"
erase "$data/pid.dta" 









